#import libraries
from __future__ import division
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
The code reads the transaction data from a CSV file into a pandas DataFrame, using the cp1252 encoding.
# Load the raw transaction data; cp1252 encoding handles the non-UTF-8
# characters present in the file.
tx_data = pd.read_csv('data.csv', encoding='cp1252')
The code initializes the Plotly library for displaying interactive visualizations in a notebook. It then displays the first few rows of the "tx_data" DataFrame.
# Enable offline Plotly rendering inside the notebook, then preview the
# first rows of the raw data.
pyoff.init_notebook_mode()
tx_data.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 | United Kingdom |
The code converts the "InvoiceDate" column in the "tx_data" DataFrame to a datetime data type using the pandas "to_datetime" function.
# Parse the InvoiceDate strings (e.g. "12/1/2010 8:26") into datetimes.
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
The code creates a new column called "InvoiceYearMonth" in the "tx_data" DataFrame, which combines the year and month information from the "InvoiceDate" column into a single numerical value.
# Encode year and month as a single integer, e.g. 2010-12 -> 201012.
tx_data['InvoiceYearMonth'] = tx_data['InvoiceDate'].dt.year * 100 + tx_data['InvoiceDate'].dt.month
The code computes descriptive statistics for the numerical columns in the "tx_data" DataFrame, such as count, mean, standard deviation, minimum, maximum, and quartile values.
# Descriptive statistics for the numeric columns; note the negative
# Quantity/UnitPrice minimums — the raw data apparently contains
# returns/cancellations.
tx_data.describe()
| Quantity | UnitPrice | CustomerID | InvoiceYearMonth | |
|---|---|---|---|---|
| count | 541909.000000 | 541909.000000 | 406829.000000 | 541909.000000 |
| mean | 9.552250 | 4.611114 | 15287.690570 | 201099.713989 |
| std | 218.081158 | 96.759853 | 1713.600303 | 25.788703 |
| min | -80995.000000 | -11062.060000 | 12346.000000 | 201012.000000 |
| 25% | 1.000000 | 1.250000 | 13953.000000 | 201103.000000 |
| 50% | 3.000000 | 2.080000 | 15152.000000 | 201107.000000 |
| 75% | 10.000000 | 4.130000 | 16791.000000 | 201110.000000 |
| max | 80995.000000 | 38970.000000 | 18287.000000 | 201112.000000 |
The code counts the number of occurrences of each unique value in the "Country" column of the "tx_data" DataFrame and returns the count for each country.
# Transaction counts per country; the United Kingdom dominates, which
# motivates restricting the analysis to UK customers below.
tx_data['Country'].value_counts()
United Kingdom 495478 Germany 9495 France 8557 EIRE 8196 Spain 2533 Netherlands 2371 Belgium 2069 Switzerland 2002 Portugal 1519 Australia 1259 Norway 1086 Italy 803 Channel Islands 758 Finland 695 Cyprus 622 Sweden 462 Unspecified 446 Austria 401 Denmark 389 Japan 358 Poland 341 Israel 297 USA 291 Hong Kong 288 Singapore 229 Iceland 182 Canada 151 Greece 146 Malta 127 United Arab Emirates 68 European Community 61 RSA 58 Lebanon 45 Lithuania 35 Brazil 32 Czech Republic 30 Bahrain 19 Saudi Arabia 10 Name: Country, dtype: int64
The code filters the "tx_data" DataFrame to create a new DataFrame called "tx_uk" that contains only the rows where the "Country" column is equal to 'United Kingdom', and resets the index of the new DataFrame.
# Keep only United Kingdom transactions (the vast majority of the data).
tx_uk = tx_data[tx_data['Country'] == 'United Kingdom'].reset_index(drop=True)
The code creates a new DataFrame called "tx_user" that contains unique values from the "CustomerID" column of the "tx_data" DataFrame. It then renames the column to 'CustomerID' and displays the first few rows of the new DataFrame.
# One row per distinct customer id; note unique() also keeps the NaN
# used for rows without a customer id.
tx_user = pd.DataFrame({'CustomerID': tx_data['CustomerID'].unique()})
tx_user.head()
| CustomerID | |
|---|---|
| 0 | 17850.0 |
| 1 | 13047.0 |
| 2 | 12583.0 |
| 3 | 13748.0 |
| 4 | 15100.0 |
The code displays the first few rows of the "tx_uk" DataFrame, which contains the transactions data specifically for customers in the United Kingdom.
# Preview the UK-only transactions.
tx_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 |
The code calculates the maximum purchase date for each customer in the "tx_uk" DataFrame and stores the results in the "tx_max_purchase" DataFrame with columns "CustomerID" and "MaxPurchaseDate". It then displays the first few rows of the "tx_max_purchase" DataFrame.
# Last purchase date per UK customer.
tx_max_purchase = (
    tx_uk.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MaxPurchaseDate'})
)
tx_max_purchase.head()
| CustomerID | MaxPurchaseDate | |
|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:17:00 |
| 1 | 12747.0 | 2011-12-07 14:34:00 |
| 2 | 12748.0 | 2011-12-09 12:20:00 |
| 3 | 12749.0 | 2011-12-06 09:56:00 |
| 4 | 12820.0 | 2011-12-06 15:12:00 |
The code calculates the recency (number of days since the last purchase) for each customer in the "tx_max_purchase" DataFrame and adds the results as a new column called "Recency". It then displays the first few rows of the updated DataFrame.
# Recency = days between a customer's last purchase and the latest
# purchase date observed anywhere in the dataset.
observation_point = tx_max_purchase['MaxPurchaseDate'].max()
tx_max_purchase['Recency'] = (observation_point - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_max_purchase.head()
| CustomerID | MaxPurchaseDate | Recency | |
|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:17:00 | 325 |
| 1 | 12747.0 | 2011-12-07 14:34:00 | 1 |
| 2 | 12748.0 | 2011-12-09 12:20:00 | 0 |
| 3 | 12749.0 | 2011-12-06 09:56:00 | 3 |
| 4 | 12820.0 | 2011-12-06 15:12:00 | 2 |
The code merges the "tx_user" DataFrame with the "tx_max_purchase" DataFrame based on the common column "CustomerID" and adds the "Recency" column from the "tx_max_purchase" DataFrame to the "tx_user" DataFrame. It then displays the first few rows of the updated DataFrame.
# Attach the Recency metric to the per-customer table.
tx_user = tx_user.merge(tx_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
tx_user.head()
| CustomerID | Recency | |
|---|---|---|
| 0 | 17850.0 | 301 |
| 1 | 13047.0 | 31 |
| 2 | 13748.0 | 95 |
| 3 | 15100.0 | 329 |
| 4 | 15291.0 | 25 |
The code performs K-means clustering on the "Recency" feature of the "tx_user" DataFrame. It iterates over different numbers of clusters and calculates the sum of squared errors (SSE) for each cluster configuration. It then plots the SSE values against the number of clusters to help determine the optimal number of clusters to use.
# Elbow method: fit K-Means for k = 1..9 on the Recency feature and plot
# the sum of squared errors (inertia) for each k to pick a cluster count.
from sklearn.cluster import KMeans
sse = {}
# Work on an explicit copy: the original assigned a throwaway "clusters"
# column into a slice of tx_user, raising SettingWithCopyWarning; the
# column was overwritten every iteration and never used, so it is dropped.
tx_recency = tx_user[['Recency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
    sse[k] = kmeans.inertia_  # SSE for this value of k
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
The code performs K-means clustering with 4 clusters on the "Recency" feature of the "tx_user" DataFrame. It assigns each data point to a cluster based on its recency value.
# Segment customers into 4 clusters on the Recency feature.
recency_km = KMeans(n_clusters=4)
recency_km.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = recency_km.labels_
tx_user.head()
| CustomerID | Recency | RecencyCluster | |
|---|---|---|---|
| 0 | 17850.0 | 301 | 1 |
| 1 | 13047.0 | 31 | 0 |
| 2 | 13748.0 | 95 | 3 |
| 3 | 15100.0 | 329 | 1 |
| 4 | 15291.0 | 25 | 0 |
The code groups the data in the "tx_user" DataFrame based on the "RecencyCluster" column and calculates descriptive statistics (count, mean, standard deviation, minimum, quartiles, and maximum) for the "Recency" column within each cluster.
# Summary of Recency within each cluster; cluster ids are arbitrary at
# this point (not yet ordered by recency).
tx_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 1950.0 | 17.488205 | 13.237058 | 0.0 | 6.00 | 16.0 | 28.00 | 47.0 |
| 1 | 478.0 | 304.393305 | 41.183489 | 245.0 | 266.25 | 300.0 | 336.00 | 373.0 |
| 2 | 568.0 | 184.625000 | 31.753602 | 132.0 | 156.75 | 184.0 | 211.25 | 244.0 |
| 3 | 954.0 | 77.679245 | 22.850898 | 48.0 | 59.00 | 72.5 | 93.00 | 131.0 |
The code defines a function called "order_cluster" that reorders the clusters based on a target field in a DataFrame. In this specific case, it reorders the "RecencyCluster" column in the "tx_user" DataFrame based on the "Recency" column in descending order. The function returns the DataFrame with the reordered clusters.
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Re-number K-Means cluster labels so they are ordered by cluster mean.

    Sorts the clusters by the mean of ``target_field_name`` (ascending or
    descending per ``ascending``) and replaces ``cluster_field_name`` with
    the cluster's rank in that ordering, so e.g. cluster 3 always holds the
    highest (or lowest) average target value.

    Note: the merge used for relabelling may reorder the rows of ``df``.
    Returns the relabelled DataFrame.
    """
    # (Removed the unused local `new_cluster_field_name` from the original.)
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index  # rank of each cluster in the ordering
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
# Re-number RecencyCluster by descending mean Recency, so cluster 0 is
# the least recent group and cluster 3 the most recent.
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)
tx_user.head()
| CustomerID | Recency | RecencyCluster | |
|---|---|---|---|
| 0 | 17850.0 | 301 | 0 |
| 1 | 15100.0 | 329 | 0 |
| 2 | 18074.0 | 373 | 0 |
| 3 | 16250.0 | 260 | 0 |
| 4 | 13747.0 | 373 | 0 |
The code groups the "tx_user" DataFrame by the "RecencyCluster" column and calculates descriptive statistics (count, mean, standard deviation, minimum, quartiles, and maximum) for the "Recency" column within each cluster.
# After ordering, cluster 0 holds the least recent customers and
# cluster 3 the most recent.
tx_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 478.0 | 304.393305 | 41.183489 | 245.0 | 266.25 | 300.0 | 336.00 | 373.0 |
| 1 | 568.0 | 184.625000 | 31.753602 | 132.0 | 156.75 | 184.0 | 211.25 | 244.0 |
| 2 | 954.0 | 77.679245 | 22.850898 | 48.0 | 59.00 | 72.5 | 93.00 | 131.0 |
| 3 | 1950.0 | 17.488205 | 13.237058 | 0.0 | 6.00 | 16.0 | 28.00 | 47.0 |
The code calculates the frequency of purchases for each customer in the "tx_uk" DataFrame by counting the number of invoice line records per customer. The results are stored in the "tx_frequency" DataFrame with columns "CustomerID" and "Frequency".
# Frequency = number of invoice line records per customer.
tx_frequency = (
    tx_uk.groupby('CustomerID')['InvoiceDate']
    .count()
    .reset_index()
    .rename(columns={'InvoiceDate': 'Frequency'})
)
tx_frequency.head()
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12346.0 | 2 |
| 1 | 12747.0 | 103 |
| 2 | 12748.0 | 4642 |
| 3 | 12749.0 | 231 |
| 4 | 12820.0 | 59 |
The code merges the "tx_user" DataFrame with the "tx_frequency" DataFrame based on the "CustomerID" column, combining the customer information with their corresponding purchase frequency. The updated "tx_user" DataFrame is then displayed using the head() function.
# Attach the Frequency metric to the per-customer table.
tx_user = tx_user.merge(tx_frequency, on='CustomerID')
tx_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | |
|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 |
| 1 | 15100.0 | 329 | 0 | 6 |
| 2 | 18074.0 | 373 | 0 | 13 |
| 3 | 16250.0 | 260 | 0 | 24 |
| 4 | 13747.0 | 373 | 0 | 1 |
The code performs K-means clustering on the "Frequency" feature of the customers in order to determine an optimal number of clusters. It calculates the sum of squared errors (SSE) for different numbers of clusters and visualizes it using a line plot. The plot helps in identifying the appropriate number of clusters based on the "elbow" point.
# Elbow method on the Frequency feature: plot SSE (inertia) for k = 1..9
# to pick the number of frequency clusters.
from sklearn.cluster import KMeans
sse = {}
# Renamed from the copy-pasted `tx_recency` (this is the Frequency
# feature), and copied explicitly. The original also wrote an unused
# "clusters" column into a slice each iteration (SettingWithCopyWarning);
# that dead assignment is removed.
frequency_feature = tx_user[['Frequency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(frequency_feature)
    sse[k] = kmeans.inertia_  # SSE for this value of k
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
The code applies K-means clustering to the "Frequency" feature of the customers, creating a new column called "FrequencyCluster" that assigns each customer to a cluster. The function order_cluster() is used to order the clusters based on the average frequency. The code then displays the descriptive statistics of the frequency for each cluster.
# Cluster customers on purchase frequency, then relabel the clusters so
# higher ids correspond to higher average frequency, and summarise.
freq_km = KMeans(n_clusters=4)
freq_km.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = freq_km.labels_
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| FrequencyCluster | ||||||||
| 0 | 3496.0 | 49.525744 | 44.954212 | 1.0 | 15.0 | 33.0 | 73.0 | 190.0 |
| 1 | 429.0 | 331.221445 | 133.856510 | 191.0 | 228.0 | 287.0 | 399.0 | 803.0 |
| 2 | 22.0 | 1313.136364 | 505.934524 | 872.0 | 988.5 | 1140.0 | 1452.0 | 2782.0 |
| 3 | 3.0 | 5917.666667 | 1805.062418 | 4642.0 | 4885.0 | 5128.0 | 6555.5 | 7983.0 |
The code calculates the revenue for each customer by multiplying the unit price with the quantity of items purchased. It then groups the data by customer ID and calculates the sum of revenue for each customer.
# Revenue per line item = quantity x unit price; total it per customer.
tx_uk['Revenue'] = tx_uk['Quantity'] * tx_uk['UnitPrice']
tx_revenue = tx_uk.groupby('CustomerID')['Revenue'].sum().reset_index()
tx_revenue.head()
| CustomerID | Revenue | |
|---|---|---|
| 0 | 12346.0 | 0.00 |
| 1 | 12747.0 | 4196.01 |
| 2 | 12748.0 | 29072.10 |
| 3 | 12749.0 | 3868.20 |
| 4 | 12820.0 | 942.34 |
The code merges the revenue data with the main dataframe 'tx_user' based on the common column 'CustomerID'. It adds the revenue information to the 'tx_user' dataframe and displays the first few rows.
# Attach each customer's total revenue to the main per-customer table.
tx_user = tx_user.merge(tx_revenue, on='CustomerID')
tx_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | |
|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 |
| 1 | 15808.0 | 305 | 0 | 210 | 1 | 3724.77 |
| 2 | 13047.0 | 31 | 3 | 196 | 1 | 3079.10 |
| 3 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 |
| 4 | 16029.0 | 38 | 3 | 274 | 1 | 50992.61 |
Elbow method to find out the optimum number of clusters for K-Means
The code performs k-means clustering on the revenue data in order to determine the optimal number of clusters. It calculates the sum of squared errors (SSE) for different numbers of clusters and plots the SSE values against the number of clusters to help visualize the elbow point, indicating the optimal number of clusters.
# Elbow method on the Revenue feature: plot SSE (inertia) for k = 1..9
# to pick the number of revenue clusters.
from sklearn.cluster import KMeans
sse = {}
# Renamed from the copy-pasted `tx_recency` (this is the Revenue
# feature), and copied explicitly. The original also wrote an unused
# "clusters" column into a slice each iteration (SettingWithCopyWarning);
# that dead assignment is removed.
revenue_feature = tx_user[['Revenue']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(revenue_feature)
    sse[k] = kmeans.inertia_  # SSE for this value of k
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
The code applies k-means clustering on the revenue data to assign customers to different revenue clusters. It then orders the cluster numbers based on the revenue values and displays the summary statistics of each revenue cluster.
# Cluster customers on total revenue, relabel so higher ids mean higher
# revenue, and summarise each cluster.
rev_km = KMeans(n_clusters=4)
rev_km.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = rev_km.labels_
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
tx_user.groupby('RevenueCluster')['Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RevenueCluster | ||||||||
| 0 | 3687.0 | 907.254414 | 921.910820 | -4287.63 | 263.115 | 572.56 | 1258.220 | 4314.72 |
| 1 | 234.0 | 7760.699530 | 3637.173671 | 4330.67 | 5161.485 | 6549.38 | 9142.305 | 21535.90 |
| 2 | 27.0 | 43070.445185 | 15939.249588 | 25748.35 | 28865.490 | 36351.42 | 53489.790 | 88125.38 |
| 3 | 2.0 | 221960.330000 | 48759.481478 | 187482.17 | 204721.250 | 221960.33 | 239199.410 | 256438.49 |
5. Overall Score based on RFM Clustering
The code calculates the overall score for each customer by summing the values of their recency, frequency, and revenue clusters. It then uses the mean function to display the average recency, frequency, and revenue values for each overall score.
#calculate overall score and use mean() to see details
# Overall RFM score = sum of the three ordered cluster ids (range 0..9,
# here 0..8 given 4 clusters each and the observed combinations).
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
# Select the columns with a list: tuple-style selection after groupby
# (df.groupby(...)['a','b']) is deprecated and removed in pandas 2.0.
tx_user.groupby('OverallScore')[['Recency', 'Frequency', 'Revenue']].mean()
| Recency | Frequency | Revenue | |
|---|---|---|---|
| OverallScore | |||
| 0 | 304.584388 | 21.995781 | 303.339705 |
| 1 | 185.362989 | 32.596085 | 498.087546 |
| 2 | 78.991304 | 46.963043 | 868.082991 |
| 3 | 20.689610 | 68.419590 | 1091.416414 |
| 4 | 14.892617 | 271.755034 | 3607.097114 |
| 5 | 9.662162 | 373.290541 | 9136.946014 |
| 6 | 7.740741 | 876.037037 | 22777.914815 |
| 7 | 1.857143 | 1272.714286 | 103954.025714 |
| 8 | 1.333333 | 5917.666667 | 42177.930000 |
The code assigns a segment label to each customer based on their overall score. Customers with an overall score greater than 2 are labeled as "Mid-Value", and customers with an overall score greater than 4 are labeled as "High-Value". The remaining customers are labeled as "Low-Value".
# Map OverallScore to a segment label: 0-2 Low, 3-4 Mid, 5+ High.
# np.select evaluates the conditions in order, so the >4 test wins
# over >2 exactly as the original chained .loc assignments did.
score = tx_user['OverallScore']
tx_user['Segment'] = np.select(
    [score > 4, score > 2],
    ['High-Value', 'Mid-Value'],
    default='Low-Value',
)
tx_user
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | Low-Value |
| 1 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 | 1 | 5 | High-Value |
| 2 | 13767.0 | 1 | 3 | 399 | 1 | 16945.71 | 1 | 5 | High-Value |
| 3 | 15513.0 | 30 | 3 | 314 | 1 | 14520.08 | 1 | 5 | High-Value |
| 4 | 14849.0 | 21 | 3 | 392 | 1 | 7904.28 | 1 | 5 | High-Value |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3945 | 12748.0 | 0 | 3 | 4642 | 3 | 29072.10 | 2 | 8 | High-Value |
| 3946 | 17841.0 | 1 | 3 | 7983 | 3 | 40340.78 | 2 | 8 | High-Value |
| 3947 | 14096.0 | 3 | 3 | 5128 | 3 | 57120.91 | 2 | 8 | High-Value |
| 3948 | 17450.0 | 7 | 3 | 351 | 1 | 187482.17 | 3 | 7 | High-Value |
| 3949 | 18102.0 | 0 | 3 | 433 | 1 | 256438.49 | 3 | 7 | High-Value |
3950 rows × 9 columns
# Preview the UK transactions with the derived columns added so far.
tx_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
The code provides a summary of the invoice dates in the dataset, including the count, the number of unique timestamps, the most frequent timestamp, and the first and last invoice dates.
# Summary of the invoice timestamps: count, number of unique values, the
# most frequent timestamp, and the first/last invoice dates.
tx_uk['InvoiceDate'].describe()
count 495478 unique 21220 top 2011-10-31 14:41:00 freq 1114 first 2010-12-01 08:26:00 last 2011-12-09 12:49:00 Name: InvoiceDate, dtype: object
The code filters the dataset tx_uk to create a subset tx_3m containing data from a 3-month time period (March 2011 to May 2011) and another subset tx_6m containing data from a 6-month time period (June 2011 to November 2011). The subsets are reset to have a new index.
# Slice the UK data into two non-overlapping windows:
#   tx_3m: 2011-03-01 .. 2011-05-31 (3-month feature window)
#   tx_6m: 2011-06-01 .. 2011-11-30 (6-month label window)
window_start, window_mid, window_end = date(2011, 3, 1), date(2011, 6, 1), date(2011, 12, 1)
invoice_day = tx_uk.InvoiceDate.dt.date
tx_3m = tx_uk[(invoice_day >= window_start) & (invoice_day < window_mid)].reset_index(drop=True)
tx_6m = tx_uk[(invoice_day >= window_mid) & (invoice_day < window_end)].reset_index(drop=True)
The code calculates the revenue for each customer in the 6-month subset tx_6m by multiplying the unit price with the quantity and creates a new dataframe tx_user_6m that aggregates the revenue for each customer by summing it up. The columns are then renamed to 'CustomerID' and 'm6_Revenue'.
#calculate revenue and create a new dataframe for it
tx_6m['Revenue'] = tx_6m['UnitPrice'] * tx_6m['Quantity']
tx_user_6m = tx_6m.groupby('CustomerID')['Revenue'].sum().reset_index()
tx_user_6m.columns = ['CustomerID','m6_Revenue']
tx_user_6m.head()
| CustomerID | m6_Revenue | |
|---|---|---|
| 0 | 12747.0 | 1666.11 |
| 1 | 12748.0 | 18679.01 |
| 2 | 12749.0 | 2323.04 |
| 3 | 12820.0 | 561.53 |
| 4 | 12822.0 | 918.98 |
The code generates a histogram plot of the 6-month revenue (m6_Revenue) for customers. The revenue values are plotted on the x-axis, and the frequency of occurrence is plotted on the y-axis.
#plot LTV histogram
plot_data = [
go.Histogram(
x=tx_user_6m['m6_Revenue']
)
]
plot_layout = go.Layout(
title='6m Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Preview the per-customer RFM table (clusters, score and segment).
tx_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | Low-Value |
| 1 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 | 1 | 5 | High-Value |
| 2 | 13767.0 | 1 | 3 | 399 | 1 | 16945.71 | 1 | 5 | High-Value |
| 3 | 15513.0 | 30 | 3 | 314 | 1 | 14520.08 | 1 | 5 | High-Value |
| 4 | 14849.0 | 21 | 3 | 392 | 1 | 7904.28 | 1 | 5 | High-Value |
# Preview the UK transactions again for reference.
tx_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 |
The code left-joins the tx_user DataFrame with the tx_user_6m DataFrame on the 'CustomerID' column, keeping every customer from tx_user; customers with no purchases in the 6-month window receive missing (NaN) revenue values.
# Left-join so every customer in the RFM table is kept; customers with
# no purchases in the 6-month window get NaN m6_Revenue (filled next).
tx_merge = tx_user.merge(tx_user_6m, on='CustomerID', how='left')
The code fills any missing values in the tx_merge DataFrame with zeros.
# Customers without 6-month purchases have NaN m6_Revenue; treat as 0.
tx_merge.fillna(0, inplace=True)
The code generates a scatter plot to visualize the relationship between the 6-month revenue (LTV) and the RFM score for different customer segments (low-value, mid-value, and high-value). Each segment is represented by a different color and marker size in the plot.
# Clip extreme LTV values (histogram above tails off by ~50,000).
tx_graph = tx_merge.query("m6_Revenue < 50000")
# One scatter trace per segment: (segment value, legend name, marker
# size, colour, opacity). Trace order (Low, Mid, High) matches original.
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment_value, legend_name, size, colour, opacity in segment_styles:
    subset = tx_graph[tx_graph['Segment'] == segment_value]
    plot_data.append(
        go.Scatter(
            x=subset['OverallScore'],
            y=subset['m6_Revenue'],
            mode='markers',
            name=legend_name,
            marker=dict(
                size=size,
                line=dict(width=1),
                color=colour,
                opacity=opacity,
            ),
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "6m LTV"},
    xaxis={'title': "RFM Score"},
    title='LTV'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
The code removes outliers from the dataset by filtering out the records where the 6-month revenue (LTV) is above the 99th percentile.
#remove outliers
tx_merge = tx_merge[tx_merge['m6_Revenue']<tx_merge['m6_Revenue'].quantile(0.99)]
tx_merge.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | Low-Value | 0.00 |
| 1 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 | 1 | 5 | High-Value | 1702.06 |
| 4 | 14849.0 | 21 | 3 | 392 | 1 | 7904.28 | 1 | 5 | High-Value | 5498.07 |
| 6 | 13468.0 | 1 | 3 | 306 | 1 | 5656.75 | 1 | 5 | High-Value | 1813.09 |
| 7 | 17690.0 | 29 | 3 | 258 | 1 | 4748.45 | 1 | 5 | High-Value | 2616.15 |
The code performs K-means clustering on the 6-month revenue (LTV) values to create three clusters. The cluster labels are assigned to the 'LTVCluster' column in the merged dataset.
#creating 3 clusters
kmeans = KMeans(n_clusters=3)
tx_merge['LTVCluster'] = kmeans.fit_predict(tx_merge[['m6_Revenue']])
tx_merge.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | LTVCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | Low-Value | 0.00 | 0 |
| 1 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 | 1 | 5 | High-Value | 1702.06 | 2 |
| 4 | 14849.0 | 21 | 3 | 392 | 1 | 7904.28 | 1 | 5 | High-Value | 5498.07 | 1 |
| 6 | 13468.0 | 1 | 3 | 306 | 1 | 5656.75 | 1 | 5 | High-Value | 1813.09 | 2 |
| 7 | 17690.0 | 29 | 3 | 258 | 1 | 4748.45 | 1 | 5 | High-Value | 2616.15 | 2 |
The code orders the cluster numbers based on the ascending order of the 6-month revenue (LTV) values. It then creates a new cluster dataframe called 'tx_cluster' and displays the statistical details of each cluster's 6-month revenue.
#order cluster number based on LTV
tx_merge = order_cluster('LTVCluster', 'm6_Revenue',tx_merge,True)
#creatinga new cluster dataframe
tx_cluster = tx_merge.copy()
#see details of the clusters
tx_cluster.groupby('LTVCluster')['m6_Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| LTVCluster | ||||||||
| 0 | 2966.0 | 278.645503 | 283.346503 | -4287.63 | 0.000 | 230.19 | 454.8125 | 945.58 |
| 1 | 790.0 | 1618.495038 | 553.023271 | 947.05 | 1158.275 | 1489.53 | 1960.8925 | 3134.08 |
| 2 | 154.0 | 4665.323961 | 1343.190242 | 3143.27 | 3557.960 | 4273.34 | 5498.0400 | 8432.68 |
# Preview the dataframe that feeds the classification model.
tx_cluster.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | LTVCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | Low-Value | 0.0 | 0 |
| 1 | 13093.0 | 266 | 0 | 170 | 0 | 7741.47 | 1 | 1 | Low-Value | 0.0 | 0 |
| 2 | 15032.0 | 255 | 0 | 55 | 0 | 4464.10 | 1 | 1 | Low-Value | 0.0 | 0 |
| 3 | 16000.0 | 2 | 3 | 9 | 0 | 12393.70 | 1 | 4 | Mid-Value | 0.0 | 0 |
| 4 | 15749.0 | 234 | 1 | 15 | 0 | 21535.90 | 1 | 2 | Low-Value | 0.0 | 0 |
The code converts the categorical variable 'Segment' in the 'tx_cluster' dataframe into numerical values using one-hot encoding, resulting in a new dataframe called 'tx_class'.
#convert categorical columns to numerical
tx_class = pd.get_dummies(tx_cluster) #There is only one categorical variable segment
tx_class.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | m6_Revenue | LTVCluster | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 | 2 | 0.0 | 0 | 0 | 1 | 0 |
| 1 | 13093.0 | 266 | 0 | 170 | 0 | 7741.47 | 1 | 1 | 0.0 | 0 | 0 | 1 | 0 |
| 2 | 15032.0 | 255 | 0 | 55 | 0 | 4464.10 | 1 | 1 | 0.0 | 0 | 0 | 1 | 0 |
| 3 | 16000.0 | 2 | 3 | 9 | 0 | 12393.70 | 1 | 4 | 0.0 | 0 | 0 | 0 | 1 |
| 4 | 15749.0 | 234 | 1 | 15 | 0 | 21535.90 | 1 | 2 | 0.0 | 0 | 0 | 1 | 0 |
The code calculates the correlation between the 'LTVCluster' column and all other columns in the 'tx_class' dataframe. It then displays the correlations in descending order, showing the variables that are most positively or negatively correlated with the 'LTVCluster' column.
#calculate and show correlations
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)
LTVCluster 1.000000 m6_Revenue 0.878053 Revenue 0.775567 RevenueCluster 0.606487 Frequency 0.567214 OverallScore 0.541769 FrequencyCluster 0.513975 Segment_High-Value 0.496868 RecencyCluster 0.358071 Segment_Mid-Value 0.189617 CustomerID -0.028556 Recency -0.349839 Segment_Low-Value -0.378708 Name: LTVCluster, dtype: float64
The code separates the feature set ('X') and the label ('y') from the 'tx_class' dataframe. It then splits the data into training and test sets using a 95:5 ratio, with 95% of the data assigned to the training set and 5% assigned to the test set.
#create X and y, X will be feature set and y is the label - LTV
X = tx_class.drop(['LTVCluster','m6_Revenue'],axis=1)
y = tx_class['LTVCluster']
#split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=56)
The code trains an XGBoost classifier model for multiclassification using the training data. It then calculates and prints the accuracy of the model on both the training set and the test set. Finally, it makes predictions on the test set using the trained model.
#XGBoost Multiclassification Model
ltv_xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1,n_jobs=-1).fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(ltv_xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'
.format(ltv_xgb_model.score(X_test[X_train.columns], y_test)))
y_pred = ltv_xgb_model.predict(X_test)
Accuracy of XGB classifier on training set: 0.95 Accuracy of XGB classifier on test set: 0.90
The code calculates and prints a classification report, which includes precision, recall, F1-score, and support metrics, comparing the predicted labels (y_pred) to the true labels (y_test) for evaluating the performance of the XGBoost classifier model.
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.94 0.94 0.94 145
1 0.76 0.80 0.78 44
2 0.83 0.71 0.77 7
accuracy 0.90 196
macro avg 0.85 0.82 0.83 196
weighted avg 0.90 0.90 0.90 196
8. Final Clusters for Customer Lifetime Value
If the model informs us that a consumer belongs to cluster 0, we will be 94 percent right (precision). Furthermore, the model correctly detects 94 percent of genuine cluster 0 consumers (recall).
The model still needs improvement for the other clusters: for example, it only detects 80 percent of Mid LTV (cluster 1) consumers and 71 percent of High LTV (cluster 2) consumers.
Actions that might be taken to improve performance include adding more features, tuning the classifier's hyperparameters, trying other algorithms, and gathering more training data for the under-represented clusters.